This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
library(ggplot2)
library(cluster) # clustering algorithms
library(factoextra) # clustering visualization
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dendextend) # for comparing two dendrograms
---------------------
Welcome to dendextend version 1.15.2
Type citation('dendextend') for how to cite the package.
Type browseVignettes(package = 'dendextend') for the package vignette.
The github page is: https://github.com/talgalili/dendextend/
Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
You may ask questions at stackoverflow, use the r and dendextend tags:
https://stackoverflow.com/questions/tagged/dendextend
To suppress this message use: suppressPackageStartupMessages(library(dendextend))
---------------------
Attaching package: ‘dendextend’
The following object is masked from ‘package:stats’:
cutree
library(tidyr) # Load tidyr
library(sparcl) # Sparse Clustering
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[30m── [1mAttaching packages[22m ───────────────────────────────────────── tidyverse 1.3.0 ──[39m
[30m[32m✓[30m [34mtibble [30m 3.1.6 [32m✓[30m [34mdplyr [30m 1.0.4
[32m✓[30m [34mreadr [30m 1.4.0 [32m✓[30m [34mstringr[30m 1.4.0
[32m✓[30m [34mpurrr [30m 0.3.4 [32m✓[30m [34mforcats[30m 0.5.1[39m
[30m── [1mConflicts[22m ──────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31mx[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
data <- read.csv("athlete_data.csv")
summary(data)
Participant.ID Age..Years. Sex Sport
Min. : 1.00 Min. :-1.00 Min. :0.0000 Min. :1.00
1st Qu.: 46.50 1st Qu.:13.00 1st Qu.:0.0000 1st Qu.:2.00
Median : 86.00 Median :14.00 Median :0.0000 Median :2.00
Mean : 85.68 Mean :14.33 Mean :0.4903 Mean :2.89
3rd Qu.:126.50 3rd Qu.:16.00 3rd Qu.:1.0000 3rd Qu.:5.00
Max. :165.00 Max. :18.00 Max. :1.0000 Max. :6.00
Concussion.History Concussion.Number Learning.Disability Anxiety.Diagnosis
Min. :0.0000 Min. :0.0000 Min. :-1.00000 Min. :-1.00000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.: 0.00000
Median :1.0000 Median :0.0000 Median : 0.00000 Median : 0.00000
Mean :0.6839 Mean :0.4258 Mean : 0.08387 Mean : 0.09677
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 0.00000 3rd Qu.: 0.00000
Max. :1.0000 Max. :3.0000 Max. : 1.00000 Max. : 1.00000
Anxiety.Symptoms Depression.Diagnosis X..of.Prior.Depressive.Episodes
Min. :-1.0000 Min. :-1.00000 Min. :-1.000
1st Qu.: 1.0000 1st Qu.: 0.00000 1st Qu.: 0.000
Median : 1.0000 Median : 0.00000 Median : 0.000
Mean : 0.7613 Mean : 0.05806 Mean : 0.471
3rd Qu.: 1.0000 3rd Qu.: 0.00000 3rd Qu.: 0.000
Max. : 1.0000 Max. : 1.00000 Max. : 4.000
Prior.Depressive.Episode.s..Y.N Aggregate.Medical.History
Min. :-1.0000 Min. :-1.0000
1st Qu.: 0.0000 1st Qu.: 0.0000
Median : 0.0000 Median : 0.0000
Mean : 0.1677 Mean : 0.1613
3rd Qu.: 0.0000 3rd Qu.: 0.0000
Max. : 1.0000 Max. : 1.0000
PCS.Symptom.Frequency..22. PCS.Symptom.Severity..132. MFQ.66
Min. : 0.000 Min. : 0.00 Min. : 0.000
1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 1.000
Median : 2.000 Median : 3.00 Median : 3.000
Mean : 4.219 Mean : 10.14 Mean : 7.787
3rd Qu.: 6.000 3rd Qu.: 11.00 3rd Qu.: 9.000
Max. :22.000 Max. :104.00 Max. :53.000
MFQ.Cut.off PCS.1 PCS.2 PCS.3
Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
Median :0.00000 Median :0.0000 Median :0.0000 Median :0.00000
Mean :0.08387 Mean :0.6129 Mean :0.1935 Mean :0.08387
3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
Max. :1.00000 Max. :6.0000 Max. :5.0000 Max. :5.00000
PCS.4 PCS.5 PCS.6 PCS.7
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0
Mean :0.2645 Mean :0.3419 Mean :0.9806 Mean :0.8
3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:2.0000 3rd Qu.:1.0
Max. :5.0000 Max. :6.0000 Max. :6.0000 Max. :6.0
PCS.8 PCS.9 PCS.10 PCS.11
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.5806 Mean :0.5548 Mean :0.4258 Mean :0.2581
3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
Max. :6.0000 Max. :6.0000 Max. :5.0000 Max. :6.0000
PCS12 PCS.13 PCS.14 PCS.15
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.2258 Mean :0.4774 Mean :0.5419 Mean :0.7419
3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
Max. :6.0000 Max. :6.0000 Max. :6.0000 Max. :5.0000
PCS.16 PCS.17 PCS.18 PCS.19
Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.000 Median :0.0000
Mean :0.4839 Mean :0.2581 Mean :0.329 Mean :0.4129
3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:0.0000
Max. :6.0000 Max. :6.0000 Max. :6.000 Max. :6.0000
PCS.20 PCS.21 PCS.22 MFQ.1
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.6903 Mean :0.6065 Mean :0.2774 Mean :0.4065
3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
Max. :6.0000 Max. :6.0000 Max. :5.0000 Max. :2.0000
MFQ.2 MFQ.3 MFQ.4 MFQ.5
Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.000 Median :0.0000 Median :0.0000
Mean :0.1419 Mean :0.271 Mean :0.2903 Mean :0.5677
3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:1.0000
Max. :2.0000 Max. :2.000 Max. :2.0000 Max. :2.0000
MFQ.6 MFQ.7 MFQ.8 MFQ9
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.1677 Mean :0.2387 Mean :0.1871 Mean :0.2903
3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
Max. :2.0000 Max. :2.0000 Max. :2.0000 Max. :2.0000
MFQ.10 MFQ11 MFQ.12 MFQ.13
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
Mean :0.5032 Mean :0.4258 Mean :0.3097 Mean :0.09677
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.5000 3rd Qu.:0.00000
Max. :2.0000 Max. :2.0000 Max. :2.0000 Max. :2.00000
MFQ.14 MFQ.15 MFQ.16 MFQ.17
Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.00000 Median :0.0000
Mean :0.1677 Mean :0.1419 Mean :0.09032 Mean :0.1161
3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
Max. :2.0000 Max. :2.0000 Max. :2.00000 Max. :2.0000
MFQ.18 MFQ.19 MFQ.20 MFQ.21
Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.00000 Median :0.00000 Median :0.0000 Median :0.0000
Mean :0.09032 Mean :0.06452 Mean :0.1806 Mean :0.3806
3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:1.0000
Max. :2.00000 Max. :2.00000 Max. :2.0000 Max. :2.0000
MFQ.22 MFQ.23 MFQ.24 MFQ.25
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.1806 Mean :0.1548 Mean :0.1935 Mean :0.2581
3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
Max. :2.0000 Max. :2.0000 Max. :2.0000 Max. :2.0000
MFQ.26 MFQ.27 MFQ.28 MFQ.29
Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.329 Mean :0.2323 Mean :0.1226 Mean :0.3097
3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
Max. :2.000 Max. :2.0000 Max. :2.0000 Max. :2.0000
MFQ.30 MFQ.31 MFQ.32 MFQ.33
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.2065 Mean :0.1484 Mean :0.2839 Mean :0.2387
3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
Max. :2.0000 Max. :2.0000 Max. :2.0000 Max. :2.0000
head(data)
tail(data)
dim(data)
[1] 155 72
names(data)
[1] "Participant.ID" "Age..Years."
[3] "Sex" "Sport"
[5] "Concussion.History" "Concussion.Number"
[7] "Learning.Disability" "Anxiety.Diagnosis"
[9] "Anxiety.Symptoms" "Depression.Diagnosis"
[11] "X..of.Prior.Depressive.Episodes" "Prior.Depressive.Episode.s..Y.N"
[13] "Aggregate.Medical.History" "PCS.Symptom.Frequency..22."
[15] "PCS.Symptom.Severity..132." "MFQ.66"
[17] "MFQ.Cut.off" "PCS.1"
[19] "PCS.2" "PCS.3"
[21] "PCS.4" "PCS.5"
[23] "PCS.6" "PCS.7"
[25] "PCS.8" "PCS.9"
[27] "PCS.10" "PCS.11"
[29] "PCS12" "PCS.13"
[31] "PCS.14" "PCS.15"
[33] "PCS.16" "PCS.17"
[35] "PCS.18" "PCS.19"
[37] "PCS.20" "PCS.21"
[39] "PCS.22" "MFQ.1"
[41] "MFQ.2" "MFQ.3"
[43] "MFQ.4" "MFQ.5"
[45] "MFQ.6" "MFQ.7"
[47] "MFQ.8" "MFQ9"
[49] "MFQ.10" "MFQ11"
[51] "MFQ.12" "MFQ.13"
[53] "MFQ.14" "MFQ.15"
[55] "MFQ.16" "MFQ.17"
[57] "MFQ.18" "MFQ.19"
[59] "MFQ.20" "MFQ.21"
[61] "MFQ.22" "MFQ.23"
[63] "MFQ.24" "MFQ.25"
[65] "MFQ.26" "MFQ.27"
[67] "MFQ.28" "MFQ.29"
[69] "MFQ.30" "MFQ.31"
[71] "MFQ.32" "MFQ.33"
sapply(data, class)
Participant.ID Age..Years.
"integer" "integer"
Sex Sport
"integer" "integer"
Concussion.History Concussion.Number
"integer" "integer"
Learning.Disability Anxiety.Diagnosis
"integer" "integer"
Anxiety.Symptoms Depression.Diagnosis
"integer" "integer"
X..of.Prior.Depressive.Episodes Prior.Depressive.Episode.s..Y.N
"integer" "integer"
Aggregate.Medical.History PCS.Symptom.Frequency..22.
"integer" "integer"
PCS.Symptom.Severity..132. MFQ.66
"integer" "integer"
MFQ.Cut.off PCS.1
"integer" "integer"
PCS.2 PCS.3
"integer" "integer"
PCS.4 PCS.5
"integer" "integer"
PCS.6 PCS.7
"integer" "integer"
PCS.8 PCS.9
"integer" "integer"
PCS.10 PCS.11
"integer" "integer"
PCS12 PCS.13
"integer" "integer"
PCS.14 PCS.15
"integer" "integer"
PCS.16 PCS.17
"integer" "integer"
PCS.18 PCS.19
"integer" "integer"
PCS.20 PCS.21
"integer" "integer"
PCS.22 MFQ.1
"integer" "integer"
MFQ.2 MFQ.3
"integer" "integer"
MFQ.4 MFQ.5
"integer" "integer"
MFQ.6 MFQ.7
"integer" "integer"
MFQ.8 MFQ9
"integer" "integer"
MFQ.10 MFQ11
"integer" "integer"
MFQ.12 MFQ.13
"integer" "integer"
MFQ.14 MFQ.15
"integer" "integer"
MFQ.16 MFQ.17
"integer" "integer"
MFQ.18 MFQ.19
"integer" "integer"
MFQ.20 MFQ.21
"integer" "integer"
MFQ.22 MFQ.23
"integer" "integer"
MFQ.24 MFQ.25
"integer" "integer"
MFQ.26 MFQ.27
"integer" "integer"
MFQ.28 MFQ.29
"integer" "integer"
MFQ.30 MFQ.31
"integer" "integer"
MFQ.32 MFQ.33
"integer" "integer"
summary(as.factor(data$Concussion.History))
0 1
49 106
#convert -1s into null values and then impute them
sum(data == -1)
[1] 12
data [data == -1] <- NA
data <- data[ -c(73) ]
head(data)
sum(is.na(data))
[1] 12
#find columns with null values
colnames(data)[colSums(is.na(data)) > 0]
[1] "Age..Years." "Learning.Disability"
[3] "Anxiety.Diagnosis" "Anxiety.Symptoms"
[5] "Depression.Diagnosis" "X..of.Prior.Depressive.Episodes"
[7] "Prior.Depressive.Episode.s..Y.N" "Aggregate.Medical.History"
#imputation
data$Age..Years.[is.na(data$Age..Years.)]<-mean(data$Age..Years.,na.rm=TRUE)
data$Learning.Disability[is.na(data$Learning.Disability)]<-mean(data$Learning.Disability,na.rm=TRUE)
data$Anxiety.Diagnosis[is.na(data$Anxiety.Diagnosis)]<-mean(data$Anxiety.Diagnosis,na.rm=TRUE)
data$Anxiety.Symptoms[is.na(data$Anxiety.Symptoms)]<-mean(data$Anxiety.Symptoms,na.rm=TRUE)
data$Depression.Diagnosis[is.na(data$Depression.Diagnosis)]<-mean(data$Depression.Diagnosis,na.rm=TRUE)
data$X..of.Prior.Depressive.Episodes[is.na(data$X..of.Prior.Depressive.Episodes)]<-mean(data$X..of.Prior.Depressive.Episodes,na.rm=TRUE)
data$Prior.Depressive.Episode.s..Y.N[is.na(data$Prior.Depressive.Episode.s..Y.N)]<-mean(data$Prior.Depressive.Episode.s..Y.N,na.rm=TRUE)
data$Aggregate.Medical.History[is.na(data$Aggregate.Medical.History)]<-mean(data$Aggregate.Medical.History,na.rm=TRUE)
sum(is.na(data))
[1] 0
plot_dat <- data
plot_dat$Concussion.History <- as.factor(plot_dat$Concussion.History)
g_1 <- ggplot(plot_dat, aes(x = PCS.Symptom.Severity..132., fill = Concussion.History)) +
geom_density(alpha = 0.5) +
theme_set(theme_bw(base_size = 11) ) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
labs(x = "Symptom Severity", title = "Relationship Between Symptom Severity and Concussion History", fill = "Diagnosis") + # Set labels
scale_fill_manual(values = c("1" = "red", "0" = "blue"),
labels = c("1" = "Concussion History", "0" = "No Concussion History"))
g_1
plot_dat <- data
plot_dat$Depression.Diagnosis <- as.factor(plot_dat$Depression.Diagnosis)
g_2 <- ggplot(plot_dat, aes(x = PCS.Symptom.Severity..132., fill = Depression.Diagnosis)) +
geom_density(alpha = 0.5) +
theme_set(theme_bw(base_size = 11) ) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
labs(x = "Symptom Severity", title = "Relationship Between Symptom Severity and Depression Diagnosis", fill = "Diagnosis") + # Set labels
scale_fill_manual(values = c("1" = "red", "0" = "blue"),
labels = c("1" = "Depression Diagnosis", "0" = "No Diagnosis"))
g_2
plot_dat <- data
plot_dat$Depression.Diagnosis <- as.factor(plot_dat$Depression.Diagnosis)
g_4 <- ggplot(plot_dat, aes(x = factor(Concussion.History), fill = Depression.Diagnosis)) +
geom_bar(alpha = 0.5, position = "dodge") +
theme_set(theme_bw(base_size = 11) ) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
labs(x = "Concussion History", title = "Relationship Between Concussion History and Depression Diagnosis", fill = "Diagnosis") + # Set labels
scale_fill_manual(values = c("1" = "red", "0" = "blue"),
labels = c("1" = "Depression Diagnosis", "0" = "No Diagnosis"))
g_4
plot_dat <- data
plot_dat$Anxiety.Diagnosis <- as.factor(plot_dat$Anxiety.Diagnosis)
g_3 <- ggplot(plot_dat, aes(x = PCS.Symptom.Severity..132., fill = Anxiety.Diagnosis)) +
geom_density(alpha = 0.5) +
theme_set(theme_bw(base_size = 11) ) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
labs(x = "Symptom Severity", title = "Relationship Between Symptom Severity and Anxiety Diagnosis", fill = "Diagnosis") + # Set labels
scale_fill_manual(values = c("1" = "red", "0" = "blue"),
labels = c("1" = "Anxiety Diagnosis", "0" = "No Diagnosis"))
g_3
Hierarchical Clustering
sdata <- scale(data[,-1])
summary(sdata)
Age..Years. Sex Sport Concussion.History
Min. :-1.216 Min. :-0.9777 Min. :-1.1298 Min. :-1.4661
1st Qu.:-1.216 1st Qu.:-0.9777 1st Qu.:-0.5321 1st Qu.:-1.4661
Median : 0.000 Median :-0.9777 Median :-0.5321 Median : 0.6777
Mean : 0.000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 1.020 3rd Qu.: 1.0163 3rd Qu.: 1.2609 3rd Qu.: 0.6777
Max. : 2.510 Max. : 1.0163 Max. : 1.8585 Max. : 0.6777
Concussion.Number Learning.Disability Anxiety.Diagnosis Anxiety.Symptoms
Min. :-0.5912 Min. :-0.3162 Min. :-0.3405 Min. :-1.9886
1st Qu.:-0.5912 1st Qu.:-0.3162 1st Qu.:-0.3405 1st Qu.: 0.5095
Median :-0.5912 Median :-0.3162 Median :-0.3405 Median : 0.5095
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.7972 3rd Qu.:-0.3162 3rd Qu.:-0.3405 3rd Qu.: 0.5095
Max. : 3.5739 Max. : 3.1623 Max. : 2.9368 Max. : 0.5095
Depression.Diagnosis X..of.Prior.Depressive.Episodes
Min. :-0.2635 Min. :-0.4111
1st Qu.:-0.2635 1st Qu.:-0.4111
Median :-0.2635 Median :-0.4111
Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.2635 3rd Qu.:-0.4111
Max. : 3.7947 Max. : 3.0107
Prior.Depressive.Episode.s..Y.N Aggregate.Medical.History
Min. :-0.4611 Min. :-0.4507
1st Qu.:-0.4611 1st Qu.:-0.4507
Median :-0.4611 Median :-0.4507
Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.4611 3rd Qu.:-0.4507
Max. : 2.1688 Max. : 2.2188
PCS.Symptom.Frequency..22. PCS.Symptom.Severity..132. MFQ.66
Min. :-0.827 Min. :-0.57098 Min. :-0.6845
1st Qu.:-0.827 1st Qu.:-0.57098 1st Qu.:-0.5966
Median :-0.435 Median :-0.40208 Median :-0.4208
Mean : 0.000 Mean : 0.00000 Mean : 0.0000
3rd Qu.: 0.349 3rd Qu.: 0.04831 3rd Qu.: 0.1066
Max. : 3.485 Max. : 5.28412 Max. : 3.9746
MFQ.Cut.off PCS.1 PCS.2 PCS.3
Min. :-0.3016 Min. :-0.5051 Min. :-0.2587 Min. :-0.1504
1st Qu.:-0.3016 1st Qu.:-0.5051 1st Qu.:-0.2587 1st Qu.:-0.1504
Median :-0.3016 Median :-0.5051 Median :-0.2587 Median :-0.1504
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.3016 3rd Qu.: 0.3190 3rd Qu.:-0.2587 3rd Qu.:-0.1504
Max. : 3.2943 Max. : 4.4393 Max. : 6.4246 Max. : 8.8140
PCS.4 PCS.5 PCS.6 PCS.7
Min. :-0.3313 Min. :-0.3654 Min. :-0.6610 Min. :-0.556
1st Qu.:-0.3313 1st Qu.:-0.3654 1st Qu.:-0.6610 1st Qu.:-0.556
Median :-0.3313 Median :-0.3654 Median :-0.6610 Median :-0.556
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.000
3rd Qu.:-0.3313 3rd Qu.:-0.3654 3rd Qu.: 0.6871 3rd Qu.: 0.139
Max. : 5.9316 Max. : 6.0459 Max. : 3.3833 Max. : 3.614
PCS.8 PCS.9 PCS.10 PCS.11
Min. :-0.4263 Min. :-0.4423 Min. :-0.4153 Min. :-0.2653
1st Qu.:-0.4263 1st Qu.:-0.4423 1st Qu.:-0.4153 1st Qu.:-0.2653
Median :-0.4263 Median :-0.4423 Median :-0.4153 Median :-0.2653
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.4263 3rd Qu.:-0.4423 3rd Qu.:-0.4153 3rd Qu.:-0.2653
Max. : 3.9785 Max. : 4.3412 Max. : 4.4612 Max. : 5.9037
PCS12 PCS.13 PCS.14 PCS.15
Min. :-0.276 Min. :-0.4291 Min. :-0.4059 Min. :-0.6102
1st Qu.:-0.276 1st Qu.:-0.4291 1st Qu.:-0.4059 1st Qu.:-0.6102
Median :-0.276 Median :-0.4291 Median :-0.4059 Median :-0.6102
Mean : 0.000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.276 3rd Qu.:-0.4291 3rd Qu.:-0.4059 3rd Qu.: 0.2122
Max. : 7.057 Max. : 4.9632 Max. : 4.0883 Max. : 3.5020
PCS.16 PCS.17 PCS.18 PCS.19
Min. :-0.3886 Min. :-0.2812 Min. :-0.3167 Min. :-0.3458
1st Qu.:-0.3886 1st Qu.:-0.2812 1st Qu.:-0.3167 1st Qu.:-0.3458
Median :-0.3886 Median :-0.2812 Median :-0.3167 Median :-0.3458
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.3886 3rd Qu.:-0.2812 3rd Qu.:-0.3167 3rd Qu.:-0.3458
Max. : 4.4306 Max. : 6.2573 Max. : 5.4585 Max. : 4.6790
PCS.20 PCS.21 PCS.22 MFQ.1
Min. :-0.4937 Min. :-0.4555 Min. :-0.303 Min. :-0.7177
1st Qu.:-0.4937 1st Qu.:-0.4555 1st Qu.:-0.303 1st Qu.:-0.7177
Median :-0.4937 Median :-0.4555 Median :-0.303 Median :-0.7177
Mean : 0.0000 Mean : 0.0000 Mean : 0.000 Mean : 0.0000
3rd Qu.: 0.2215 3rd Qu.:-0.4555 3rd Qu.:-0.303 3rd Qu.: 1.0480
Max. : 3.7971 Max. : 4.0510 Max. : 5.159 Max. : 2.8138
MFQ.2 MFQ.3 MFQ.4 MFQ.5
Min. :-0.3683 Min. :-0.4925 Min. :-0.5204 Min. :-0.7779
1st Qu.:-0.3683 1st Qu.:-0.4925 1st Qu.:-0.5204 1st Qu.:-0.7779
Median :-0.3683 Median :-0.4925 Median :-0.5204 Median :-0.7779
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.3683 3rd Qu.:-0.4925 3rd Qu.:-0.5204 3rd Qu.: 0.5922
Max. : 4.8208 Max. : 3.1425 Max. : 3.0644 Max. : 1.9623
MFQ.6 MFQ.7 MFQ.8 MFQ9
Min. :-0.3701 Min. :-0.4268 Min. :-0.3691 Min. :-0.4817
1st Qu.:-0.3701 1st Qu.:-0.4268 1st Qu.:-0.3691 1st Qu.:-0.4817
Median :-0.3701 Median :-0.4268 Median :-0.3691 Median :-0.4817
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.3701 3rd Qu.:-0.4268 3rd Qu.:-0.3691 3rd Qu.:-0.4817
Max. : 4.0424 Max. : 3.1493 Max. : 3.5763 Max. : 2.8368
MFQ.10 MFQ11 MFQ.12 MFQ.13
Min. :-0.7322 Min. :-0.6509 Min. :-0.5374 Min. :-0.2716
1st Qu.:-0.7322 1st Qu.:-0.6509 1st Qu.:-0.5374 1st Qu.:-0.2716
Median :-0.7322 Median :-0.6509 Median :-0.5374 Median :-0.2716
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.7228 3rd Qu.: 0.8778 3rd Qu.: 0.3303 3rd Qu.:-0.2716
Max. : 2.1777 Max. : 2.4065 Max. : 2.9334 Max. : 5.3419
MFQ.14 MFQ.15 MFQ.16 MFQ.17
Min. :-0.3487 Min. :-0.3278 Min. :-0.274 Min. :-0.2947
1st Qu.:-0.3487 1st Qu.:-0.3278 1st Qu.:-0.274 1st Qu.:-0.2947
Median :-0.3487 Median :-0.3278 Median :-0.274 Median :-0.2947
Mean : 0.0000 Mean : 0.0000 Mean : 0.000 Mean : 0.0000
3rd Qu.:-0.3487 3rd Qu.:-0.3278 3rd Qu.:-0.274 3rd Qu.:-0.2947
Max. : 3.8088 Max. : 4.2908 Max. : 5.793 Max. : 4.7811
MFQ.18 MFQ.19 MFQ.20 MFQ.21
Min. :-0.274 Min. :-0.2191 Min. :-0.403 Min. :-0.6071
1st Qu.:-0.274 1st Qu.:-0.2191 1st Qu.:-0.403 1st Qu.:-0.6071
Median :-0.274 Median :-0.2191 Median :-0.403 Median :-0.6071
Mean : 0.000 Mean : 0.0000 Mean : 0.000 Mean : 0.0000
3rd Qu.:-0.274 3rd Qu.:-0.2191 3rd Qu.:-0.403 3rd Qu.: 0.9878
Max. : 5.793 Max. : 6.5725 Max. : 4.059 Max. : 2.5827
MFQ.22 MFQ.23 MFQ.24 MFQ.25
Min. :-0.403 Min. :-0.319 Min. :-0.3886 Min. :-0.4738
1st Qu.:-0.403 1st Qu.:-0.319 1st Qu.:-0.3886 1st Qu.:-0.4738
Median :-0.403 Median :-0.319 Median :-0.3886 Median :-0.4738
Mean : 0.000 Mean : 0.000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.403 3rd Qu.:-0.319 3rd Qu.:-0.3886 3rd Qu.:-0.4738
Max. : 4.059 Max. : 3.801 Max. : 3.6274 Max. : 3.1985
MFQ.26 MFQ.27 MFQ.28 MFQ.29
Min. :-0.5762 Min. :-0.3935 Min. :-0.2747 Min. :-0.4915
1st Qu.:-0.5762 1st Qu.:-0.3935 1st Qu.:-0.2747 1st Qu.:-0.4915
Median :-0.5762 Median :-0.3935 Median :-0.2747 Median :-0.4915
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 1.1750 3rd Qu.:-0.3935 3rd Qu.:-0.2747 3rd Qu.:-0.4915
Max. : 2.9261 Max. : 2.9953 Max. : 4.2069 Max. : 2.6828
MFQ.30 MFQ.31 MFQ.32 MFQ.33
Min. :-0.3889 Min. :-0.3385 Min. :-0.4816 Min. :-0.4268
1st Qu.:-0.3889 1st Qu.:-0.3385 1st Qu.:-0.4816 1st Qu.:-0.4268
Median :-0.3889 Median :-0.3385 Median :-0.4816 Median :-0.4268
Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
3rd Qu.:-0.3889 3rd Qu.:-0.3385 3rd Qu.:-0.4816 3rd Qu.:-0.4268
Max. : 3.3788 Max. : 4.2243 Max. : 2.9113 Max. : 3.1493
# Calculate distances between points
dist_mat <- dist(sdata, # Set dataset
method = "euclidean") # Set distance measure to use
# Run hierarchical clustering
hc <- hclust(dist_mat, # Set distance matrix to use
method = "average" ) # Set linkage measure to use, for all the points in the cluster, what is the avg distance
plot(hc, # Set hierarchical clustering as plot object
cex = 0.6, # Set text size
hang = -1 ) # Set label position
# Create dendrogram
dend <- as.dendrogram(hc)
# order it the closest we can to the order of the observations:
dend <- rotate(dend, 1:50)
number of items to replace is not a multiple of replacement length
# Color the branches based on the clusters:
dend <- color_branches(dend, k=10)
# We hang the dendrogram a bit:
dend <- hang.dendrogram(dend,hang_height=0.1)
# reduce the size of the labels:
dend <- set(dend, "labels_cex", 0.55)
# And plot:
par(mar = c(3,3,3,7))
plot(dend,
main = "Clustered Concussion Data",
horiz = TRUE, nodePar = list(cex = .007))
clusters <- cutree(hc, # Specify object
k = 5) # Specify number of clusters
#look @ every cluster and find avg of variables
clustermeans <- as.data.frame(matrix(NA, nrow = 5, ncol = ncol(sdata)))
for(i in 1:5){
if (sum(clusters == i) > 1){
clustermeans[i, ] <- colMeans(sdata[clusters == i,])
}
else{
clustermeans[i,] <- sdata[clusters == i,]
}
}
names(clustermeans) <- names(data)[-1]
clustermeans
summary(as.factor(clusters))
1 2 3 4 5
146 5 2 1 1
fviz_cluster(list(data = data, # Set data
cluster = clusters)) # Set clusters
cbind(row.names(data), clusters)
clusters
[1,] "1" "1"
[2,] "2" "1"
[3,] "3" "1"
[4,] "4" "1"
[5,] "5" "1"
[6,] "6" "1"
[7,] "7" "1"
[8,] "8" "1"
[9,] "9" "1"
[10,] "10" "1"
[11,] "11" "1"
[12,] "12" "2"
[13,] "13" "1"
[14,] "14" "2"
[15,] "15" "1"
[16,] "16" "1"
[17,] "17" "1"
[18,] "18" "1"
[19,] "19" "1"
[20,] "20" "1"
[21,] "21" "1"
[22,] "22" "1"
[23,] "23" "1"
[24,] "24" "1"
[25,] "25" "1"
[26,] "26" "1"
[27,] "27" "1"
[28,] "28" "1"
[29,] "29" "1"
[30,] "30" "1"
[31,] "31" "1"
[32,] "32" "1"
[33,] "33" "1"
[34,] "34" "1"
[35,] "35" "1"
[36,] "36" "1"
[37,] "37" "1"
[38,] "38" "1"
[39,] "39" "1"
[40,] "40" "1"
[41,] "41" "1"
[42,] "42" "1"
[43,] "43" "1"
[44,] "44" "1"
[45,] "45" "1"
[46,] "46" "3"
[47,] "47" "1"
[48,] "48" "1"
[49,] "49" "1"
[50,] "50" "1"
[51,] "51" "2"
[52,] "52" "1"
[53,] "53" "1"
[54,] "54" "1"
[55,] "55" "1"
[56,] "56" "1"
[57,] "57" "1"
[58,] "58" "1"
[59,] "59" "1"
[60,] "60" "1"
[61,] "61" "1"
[62,] "62" "1"
[63,] "63" "1"
[64,] "64" "3"
[65,] "65" "1"
[66,] "66" "1"
[67,] "67" "1"
[68,] "68" "1"
[69,] "69" "1"
[70,] "70" "1"
[71,] "71" "1"
[72,] "72" "2"
[73,] "73" "1"
[74,] "74" "4"
[75,] "75" "1"
[76,] "76" "1"
[77,] "77" "1"
[78,] "78" "2"
[79,] "79" "5"
[80,] "80" "1"
[81,] "81" "1"
[82,] "82" "1"
[83,] "83" "1"
[84,] "84" "1"
[85,] "85" "1"
[86,] "86" "1"
[87,] "87" "1"
[88,] "88" "1"
[89,] "89" "1"
[90,] "90" "1"
[91,] "91" "1"
[92,] "92" "1"
[93,] "93" "1"
[94,] "94" "1"
[95,] "95" "1"
[96,] "96" "1"
[97,] "97" "1"
[98,] "98" "1"
[99,] "99" "1"
[100,] "100" "1"
[101,] "101" "1"
[102,] "102" "1"
[103,] "103" "1"
[104,] "104" "1"
[105,] "105" "1"
[106,] "106" "1"
[107,] "107" "1"
[108,] "108" "1"
[109,] "109" "1"
[110,] "110" "1"
[111,] "111" "1"
[112,] "112" "1"
[113,] "113" "1"
[114,] "114" "1"
[115,] "115" "1"
[116,] "116" "1"
[117,] "117" "1"
[118,] "118" "1"
[119,] "119" "1"
[120,] "120" "1"
[121,] "121" "1"
[122,] "122" "1"
[123,] "123" "1"
[124,] "124" "1"
[125,] "125" "1"
[126,] "126" "1"
[127,] "127" "1"
[128,] "128" "1"
[129,] "129" "1"
[130,] "130" "1"
[131,] "131" "1"
[132,] "132" "1"
[133,] "133" "1"
[134,] "134" "1"
[135,] "135" "1"
[136,] "136" "1"
[137,] "137" "1"
[138,] "138" "1"
[139,] "139" "1"
[140,] "140" "1"
[141,] "141" "1"
[142,] "142" "1"
[143,] "143" "1"
[144,] "144" "1"
[145,] "145" "1"
[146,] "146" "1"
[147,] "147" "1"
[148,] "148" "1"
[149,] "149" "1"
[150,] "150" "1"
[151,] "151" "1"
[152,] "152" "1"
[153,] "153" "1"
[154,] "154" "1"
[155,] "155" "1"
colMeans(data[clusters == 1,])
Participant.ID Age..Years.
85.520547945 14.618240807
Sex Sport
0.520547945 2.801369863
Concussion.History Concussion.Number
0.684931507 0.424657534
Learning.Disability Anxiety.Diagnosis
0.075965131 0.089752713
Anxiety.Symptoms Depression.Diagnosis
0.824576424 0.048389966
X..of.Prior.Depressive.Episodes Prior.Depressive.Episode.s..Y.N
0.366304928 0.138187155
Aggregate.Medical.History PCS.Symptom.Frequency..22.
0.151841309 3.390410959
PCS.Symptom.Severity..132. MFQ.66
6.842465753 5.609589041
MFQ.Cut.off PCS.1
0.027397260 0.465753425
PCS.2 PCS.3
0.123287671 0.027397260
PCS.4 PCS.5
0.150684932 0.212328767
PCS.6 PCS.7
0.787671233 0.657534247
PCS.8 PCS.9
0.431506849 0.417808219
PCS.10 PCS.11
0.273972603 0.130136986
PCS12 PCS.13
0.123287671 0.321917808
PCS.14 PCS.15
0.328767123 0.595890411
PCS.16 PCS.17
0.232876712 0.123287671
PCS.18 PCS.19
0.157534247 0.205479452
PCS.20 PCS.21
0.472602740 0.452054795
PCS.22 MFQ.1
0.150684932 0.349315068
MFQ.2 MFQ.3
0.095890411 0.226027397
MFQ.4 MFQ.5
0.253424658 0.506849315
MFQ.6 MFQ.7
0.102739726 0.178082192
MFQ.8 MFQ9
0.089041096 0.198630137
MFQ.10 MFQ11
0.431506849 0.356164384
MFQ.12 MFQ.13
0.253424658 0.041095890
MFQ.14 MFQ.15
0.095890411 0.082191781
MFQ.16 MFQ.17
0.027397260 0.061643836
MFQ.18 MFQ.19
0.027397260 0.006849315
MFQ.20 MFQ.21
0.136986301 0.294520548
MFQ.22 MFQ.23
0.123287671 0.068493151
MFQ.24 MFQ.25
0.136986301 0.171232877
MFQ.26 MFQ.27
0.273972603 0.130136986
MFQ.28 MFQ.29
0.041095890 0.232876712
MFQ.30 MFQ.31
0.123287671 0.075342466
MFQ.32 MFQ.33
0.219178082 0.198630137
colMeans(data[clusters == 2,])
Participant.ID Age..Years.
102.4 14.6
Sex Sport
0.0 5.4
Concussion.History Concussion.Number
0.6 0.6
Learning.Disability Anxiety.Diagnosis
0.4 0.4
Anxiety.Symptoms Depression.Diagnosis
0.4 0.2
X..of.Prior.Depressive.Episodes Prior.Depressive.Episode.s..Y.N
2.4 0.8
Aggregate.Medical.History PCS.Symptom.Frequency..22.
0.4 15.6
PCS.Symptom.Severity..132. MFQ.66
43.4 44.8
MFQ.Cut.off PCS.1
1.0 1.8
PCS.2 PCS.3
0.2 0.0
PCS.4 PCS.5
1.0 1.0
PCS.6 PCS.7
2.8 1.6
PCS.8 PCS.9
2.0 2.4
PCS.10 PCS.11
1.8 0.8
PCS12 PCS.13
0.6 2.4
PCS.14 PCS.15
3.8 2.4
PCS.16 PCS.17
4.6 1.8
PCS.18 PCS.19
2.4 3.0
PCS.20 PCS.21
3.6 1.4
PCS.22 MFQ.1
2.0 1.8
MFQ.2 MFQ.3
1.2 1.2
MFQ.4 MFQ.5
0.8 1.4
MFQ.6 MFQ.7
1.2 1.0
MFQ.8 MFQ9
2.0 2.0
MFQ.10 MFQ11
1.6 1.8
MFQ.12 MFQ.13
1.2 0.8
MFQ.14 MFQ.15
1.2 1.0
MFQ.16 MFQ.17
1.2 1.2
MFQ.18 MFQ.19
1.4 1.2
MFQ.20 MFQ.21
0.8 1.6
MFQ.22 MFQ.23
1.0 1.6
MFQ.24 MFQ.25
1.4 1.8
MFQ.26 MFQ.27
1.4 2.0
MFQ.28 MFQ.29
1.8 1.8
MFQ.30 MFQ.31
1.6 1.2
MFQ.32 MFQ.33
1.2 0.4
colMeans(data[clusters == 4,])
Participant.ID Age..Years.
71.00000 14.63158
Sex Sport
0.00000 3.00000
Concussion.History Concussion.Number
0.00000 1.00000
Learning.Disability Anxiety.Diagnosis
0.00000 0.00000
Anxiety.Symptoms Depression.Diagnosis
0.00000 1.00000
X..of.Prior.Depressive.Episodes Prior.Depressive.Episode.s..Y.N
1.00000 1.00000
Aggregate.Medical.History PCS.Symptom.Frequency..22.
1.00000 19.00000
PCS.Symptom.Severity..132. MFQ.66
94.00000 35.00000
MFQ.Cut.off PCS.1
1.00000 6.00000
PCS.2 PCS.3
3.00000 2.00000
PCS.4 PCS.5
2.00000 6.00000
PCS.6 PCS.7
6.00000 4.00000
PCS.8 PCS.9
5.00000 0.00000
PCS.10 PCS.11
5.00000 4.00000
PCS12 PCS.13
6.00000 6.00000
PCS.14 PCS.15
6.00000 5.00000
PCS.16 PCS.17
5.00000 0.00000
PCS.18 PCS.19
6.00000 6.00000
PCS.20 PCS.21
5.00000 6.00000
PCS.22 MFQ.1
0.00000 1.00000
MFQ.2 MFQ.3
0.00000 0.00000
MFQ.4 MFQ.5
2.00000 2.00000
MFQ.6 MFQ.7
0.00000 1.00000
MFQ.8 MFQ9
1.00000 2.00000
MFQ.10 MFQ11
2.00000 1.00000
MFQ.12 MFQ.13
0.00000 0.00000
MFQ.14 MFQ.15
2.00000 2.00000
MFQ.16 MFQ.17
2.00000 2.00000
MFQ.18 MFQ.19
0.00000 2.00000
MFQ.20 MFQ.21
0.00000 2.00000
MFQ.22 MFQ.23
0.00000 1.00000
MFQ.24 MFQ.25
0.00000 2.00000
MFQ.26 MFQ.27
0.00000 2.00000
MFQ.28 MFQ.29
0.00000 0.00000
MFQ.30 MFQ.31
2.00000 2.00000
MFQ.32 MFQ.33
0.00000 2.00000
colMeans(data[clusters == 5,])
Participant.ID Age..Years.
77 14
Sex Sport
0 3
Concussion.History Concussion.Number
1 0
Learning.Disability Anxiety.Diagnosis
1 1
Anxiety.Symptoms Depression.Diagnosis
0 1
X..of.Prior.Depressive.Episodes Prior.Depressive.Episode.s..Y.N
4 1
Aggregate.Medical.History PCS.Symptom.Frequency..22.
1 18
PCS.Symptom.Severity..132. MFQ.66
72 49
MFQ.Cut.off PCS.1
1 2
PCS.2 PCS.3
0 0
PCS.4 PCS.5
4 1
PCS.6 PCS.7
6 4
PCS.8 PCS.9
2 6
PCS.10 PCS.11
4 6
PCS12 PCS.13
0 0
PCS.14 PCS.15
1 2
PCS.16 PCS.17
4 6
PCS.18 PCS.19
3 4
PCS.20 PCS.21
6 6
PCS.22 MFQ.1
5 1
MFQ.2 MFQ.3
1 0
MFQ.4 MFQ.5
2 2
MFQ.6 MFQ.7
2 2
MFQ.8 MFQ9
2 2
MFQ.10 MFQ11
2 2
MFQ.12 MFQ.13
2 2
MFQ.14 MFQ.15
2 1
MFQ.16 MFQ.17
0 0
MFQ.18 MFQ.19
1 0
MFQ.20 MFQ.21
2 2
MFQ.22 MFQ.23
2 2
MFQ.24 MFQ.25
1 1
MFQ.26 MFQ.27
2 2
MFQ.28 MFQ.29
2 2
MFQ.30 MFQ.31
2 1
MFQ.32 MFQ.33
2 0
remove outliers
sdata <- sdata[-c(74, 79), ]
set.seed(12345) # Set seed for reproducibility
fit_1 <- kmeans(x = sdata, # Set data as explantory variables
centers = 5, # Set number of clusters
nstart = 25, # Set number of starts
iter.max = 100 ) # Set maximum number of iterations to use
# Extract clusters
clusters_1 <- fit_1$cluster
# Extract centers
centers_1 <- fit_1$centers
# Check samples per cluster
summary(as.factor(clusters_1))
1 2 3 4 5
113 1 6 25 1
# Create vector of clusters
cluster <- c(1: 5)
# Extract centers
center_df <- data.frame(cluster, centers_1)
# Reshape the data
center_reshape <- gather(center_df, features, values, Age..Years.:MFQ.Cut.off)
# View first few rows
head(center_reshape)
# Create plot
g_heat_1 <- ggplot(data = center_reshape, # Set dataset
aes(x = features, y = cluster, fill = values)) + # Set aesthetics
scale_y_continuous(breaks = seq(1, 5, by = 1)) + # Set y axis breaks
geom_tile() + # Geom tile for heatmap
coord_equal() + # Make scale the same for both axis
theme_set(theme_bw(base_size = 22) ) + # Set theme
scale_fill_gradient2(low = "blue", # Choose low color
mid = "white", # Choose mid color
high = "red", # Choose high color
midpoint =0, # Choose mid point
space = "Lab",
na.value ="grey", # Choose NA value
guide = "colourbar", # Set color bar
aesthetics = "fill") + # Select aesthetics to apply
coord_flip() # Rotate plot to view names clearly
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Generate plot
g_heat_1
#makes case for dropping clusters 2 and 3 bc their values are so distinct from all of the others
# Create vector of clusters
cluster <- c(1: 5)
# Extract centers
center_df <- data.frame(cluster, centers_1)
# Reshape the data
center_reshape <- gather(center_df, features, values, PCS.1:PCS.22)
# View first few rows
head(center_reshape)
# Create plot
g_heat_1 <- ggplot(data = center_reshape, # Set dataset
aes(x = features, y = cluster, fill = values)) + # Set aesthetics
scale_y_continuous(breaks = seq(1, 5, by = 1)) + # Set y axis breaks
geom_tile() + # Geom tile for heatmap
coord_equal() + # Make scale the same for both axis
theme_set(theme_bw(base_size = 22) ) + # Set theme
scale_fill_gradient2(low = "blue", # Choose low color
mid = "white", # Choose mid color
high = "red", # Choose high color
midpoint =0, # Choose mid point
space = "Lab",
na.value ="grey", # Choose NA value
guide = "colourbar", # Set color bar
aesthetics = "fill") + # Select aesthetics to apply
coord_flip() # Rotate plot to view names clearly
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Generate plot
g_heat_1
ggsave(g_heat_1, file = "PCSPlot.jpeg", width = 8, height = 12, dpi = 600)
#makes case for dropping clusters 2 and 3 bc their values are so distinct from all of the others
# Create vector of clusters
cluster <- c(1: 5)
# Extract centers
center_df <- data.frame(cluster, centers_1)
# Reshape the data
center_reshape <- gather(center_df, features, values, MFQ.1:MFQ.33)
# View first few rows
head(center_reshape)
# Create plot
g_heat_1 <- ggplot(data = center_reshape, # Set dataset
aes(x = features, y = cluster, fill = values)) + # Set aesthetics
scale_y_continuous(breaks = seq(1, 5, by = 1)) + # Set y axis breaks
geom_tile() + # Geom tile for heatmap
coord_equal() + # Make scale the same for both axis
theme_set(theme_bw(base_size = 22) ) + # Set theme
scale_fill_gradient2(low = "blue", # Choose low color
mid = "white", # Choose mid color
high = "red", # Choose high color
midpoint =0, # Choose mid point
space = "Lab",
na.value ="grey", # Choose NA value
guide = "colourbar", # Set color bar
aesthetics = "fill") + # Select aesthetics to apply
coord_flip() # Rotate plot to view names clearly
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Generate plot
g_heat_1
ggsave(g_heat_1, file = "PCSPlot.jpeg", width = 8, height = 12, dpi = 600)
data2 <- data[!clusters %in% c(2,3,5),]
sdata2 <- scale(data2[,-1])
#head(sdata2)
# Create silhouette plot summary
fviz_nbclust(sdata2, # Set dataset
kmeans,# Set clustering method
method = "silhouette") # Set evaluation method
set.seed(12345) # Set seed for reproducibility
fit_2 <- kmeans(x = sdata2, # Set data as explantory variables
centers = 2, # Set number of clusters
nstart = 25, # Set number of starts
iter.max = 100 ) # Set maximum number of iterations to use
# Extract clusters
clusters_2 <- fit_2$cluster
# Extract centers
centers_2 <- fit_2$centers
# Check samples per cluster
summary(as.factor(clusters_2))
1 2
20 127
# Create vector of clusters
cluster <- c(1: 2)
# Extract centers
center_df <- data.frame(cluster, centers_2)
# Reshape the data
center_reshape <- gather(center_df, features, values, Age..Years.:MFQ.Cut.off)
# View first few rows
head(center_reshape)
# Create plot
g_heat_1 <- ggplot(data = center_reshape, # Set dataset
aes(x = features, y = cluster, fill = values)) + # Set aesthetics
scale_y_continuous(breaks = seq(1, 5, by = 1)) + # Set y axis breaks
geom_tile() + # Geom tile for heatmap
coord_equal() + # Make scale the same for both axis
theme_set(theme_bw(base_size = 22) ) + # Set theme
scale_fill_gradient2(low = "blue", # Choose low color
mid = "white", # Choose mid color
high = "red", # Choose high color
midpoint =0, # Choose mid point
space = "Lab",
na.value ="grey", # Choose NA value
guide = "colourbar", # Set color bar
aesthetics = "fill") + # Select aesthetics to apply
coord_flip() # Rotate plot to view names clearly
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Generate plot
g_heat_1
# Create vector of clusters
cluster <- c(1: 2)
# Extract centers
center_df <- data.frame(cluster, centers_2)
# Reshape the data
center_reshape <- gather(center_df, features, values, PCS.1:PCS.22)
# View first few rows
head(center_reshape)
# Create plot
g_heat_1 <- ggplot(data = center_reshape, # Set dataset
aes(x = features, y = cluster, fill = values)) + # Set aesthetics
scale_y_continuous(breaks = seq(1, 5, by = 1)) + # Set y axis breaks
geom_tile() + # Geom tile for heatmap
coord_equal() + # Make scale the same for both axis
theme_set(theme_bw(base_size = 22) ) + # Set theme
scale_fill_gradient2(low = "blue", # Choose low color
mid = "white", # Choose mid color
high = "red", # Choose high color
midpoint =0, # Choose mid point
space = "Lab",
na.value ="grey", # Choose NA value
guide = "colourbar", # Set color bar
aesthetics = "fill") + # Select aesthetics to apply
coord_flip() # Rotate plot to view names clearly
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Generate plot
g_heat_1
# Create vector of clusters
cluster <- c(1: 2)
# Extract centers
center_df <- data.frame(cluster, centers_2)
# Reshape the data
center_reshape <- gather(center_df, features, values, MFQ.1:MFQ.33)
# View first few rows
head(center_reshape)
# Create plot
g_heat_1 <- ggplot(data = center_reshape, # Set dataset
aes(x = features, y = cluster, fill = values)) + # Set aesthetics
scale_y_continuous(breaks = seq(1, 5, by = 1)) + # Set y axis breaks
geom_tile() + # Geom tile for heatmap
coord_equal() + # Make scale the same for both axis
theme_set(theme_bw(base_size = 22) ) + # Set theme
scale_fill_gradient2(low = "blue", # Choose low color
mid = "white", # Choose mid color
high = "red", # Choose high color
midpoint =0, # Choose mid point
space = "Lab",
na.value ="grey", # Choose NA value
guide = "colourbar", # Set color bar
aesthetics = "fill") + # Select aesthetics to apply
coord_flip() # Rotate plot to view names clearly
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Generate plot
g_heat_1
#want our clusters to be balanced if possible (bad if cluster w much lower value)
plot_clust_cardinality <- as.data.frame(clusters_2)
names(plot_clust_cardinality) <- c("k_2") # Set names
# Create bar plots
g_2 <- ggplot(plot_clust_cardinality, aes( x = factor(k_2))) + # Set x as cluster values
geom_bar(stat = "count", fill = "steelblue") + # Use geom_bar with stat = "count" to count observations
labs(x = "Cluster Number", y="Points in Cluster", # Set labels
title = "Cluster Cardinality (k = 4)") +
theme(panel.grid.major = element_blank(), # Turn of the background grid
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
g_2
k_4_mag <- cbind.data.frame(c(1:2), fit_2$withinss) # Extract within cluster sum of squares
names(k_4_mag) <- c("cluster", "withinss") # Fix names for plot data
# Create bar plot
g_4 <- ggplot(k_4_mag, aes(x = cluster, y = withinss)) + # Set x as cluster, y as withinss
geom_bar(stat = "identity", fill = "steelblue") + # Use geom bar and stat = "identity" to plot values directly
labs(x = "Cluster Number", y="Total Point to Centroid Distance", # Set labels
title = "Cluster Magnitude (k = 4)") +
theme(panel.grid.major = element_blank(), # Turn of the background grid
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
g_4
k_4_dat <- cbind.data.frame(table(clusters_2), k_4_mag[,2]) # Join magnitude and cardinality
names(k_4_dat) <- c("cluster", "cardinality", "magnitude") # Fix plot data names # card = how many samples there are and magnitude = how much error there is in each cluster
# Create scatter plot
g_6 <- ggplot(k_4_dat, aes(x = cardinality, y = magnitude, color = cluster)) + # Set aesthetics
geom_point(alpha = 0.8, size = 4) + # Set geom point for scatter
geom_smooth(aes(x = cardinality, y = magnitude), method = "lm",
se = FALSE, inherit.aes = FALSE, alpha = 0.5) + # Set trend line
labs(x = "Cluster Cardinality", y="Total Point to Centroid Distance", # Set labels
title = "Cluster Magnitude vs Cardinality \n(k = 2)") +
theme(panel.grid.major = element_blank(), # Turn of the background grid
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
g_6
# Calculate distance between samples
dis = dist(sdata2)^2
# Set plotting parameters to view plot
op <- par(mfrow= c(1,1), oma= c(0,0, 3, 0),
mgp= c(1.6,.8,0), mar= .1+c(4,2,2,2))
# Create silhouette for k=4
#how similar sample is to others in its cluster/how dissimilar it is to vals in otherclusters
#want values to be as high as possible, if negative val more similar to vals in other clusters
sil = silhouette (clusters_2 , # Set clustering
dis, # Set distance
full = TRUE) # Generate silhouette for all samples
# Generate silhouette plot
plot(sil)
# Create silhouette plot summary
fviz_nbclust(sdata2[clusters_2 ==1,], # Set dataset
kmeans,# Set clustering method
method = "silhouette") # Set evaluation method
Smaller group of data from sillouhette plot
set.seed(12345) # Set seed for reproducibility
fit_3 <- kmeans(x = sdata2[clusters_2 == 1,], # Set data as explantory variables
centers = 2, # Set number of clusters
nstart = 25, # Set number of starts
iter.max = 100 ) # Set maximum number of iterations to use
# Extract clusters
clusters_3 <- fit_3$cluster
# Extract centers
centers_3 <- fit_3$centers
# Checksamples per cluster
summary(as.factor(clusters_3))
1 2
19 1
set.seed(12345) # Set seed for reproducibility
fit_4 <- kmeans(x = sdata3, # Set data as explantory variables
centers = 3, # Set number of clusters
nstart = 25, # Set number of starts
iter.max = 100 ) # Set maximum number of iterations to use
# Extract clusters
clusters_4 <- fit_4$cluster
# Extract centers
centers_4 <- fit_4$centers
# Checksamples per cluster
summary(as.factor(clusters_4))
1 2 3
2 14 3
Bigger group of data from silhouette plot
sdata5 <- sdata2[clusters_2 ==2,]
# Create silhouette plot summary
fviz_nbclust(sdata5, # Set dataset
kmeans,# Set clustering method
method = "silhouette") # Set evaluation method
set.seed(12345) # Set seed for reproducibility
fit_5 <- kmeans(x = sdata2[clusters_2 == 2,], # Set data as explantory variables
centers = 4, # Set number of clusters
nstart = 25, # Set number of starts
iter.max = 100 ) # Set maximum number of iterations to use
# Extract clusters
clusters_5 <- fit_5$cluster
# Extract centers
centers_5 <- fit_5$centers
# Checksamples per cluster
summary(as.factor(clusters_5))
1 2 3 4
8 12 67 40
# Create vector of clusters
cluster <- c(1: 4)
# Extract centers
center_df <- data.frame(cluster, centers_5)
# Reshape the data
center_reshape <- gather(center_df, features, values, Age..Years.:MFQ.Cut.off)
# View first few rows
head(center_reshape)
# Create plot
g_heat_1 <- ggplot(data = center_reshape, # Set dataset
aes(x = features, y = cluster, fill = values)) + # Set aesthetics
scale_y_continuous(breaks = seq(1, 5, by = 1)) + # Set y axis breaks
geom_tile() + # Geom tile for heatmap
coord_equal() + # Make scale the same for both axis
theme_set(theme_bw(base_size = 22) ) + # Set theme
scale_fill_gradient2(low = "blue", # Choose low color
mid = "white", # Choose mid color
high = "red", # Choose high color
midpoint =0, # Choose mid point
space = "Lab",
na.value ="grey", # Choose NA value
guide = "colourbar", # Set color bar
aesthetics = "fill") + # Select aesthetics to apply
coord_flip() # Rotate plot to view names clearly
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Generate plot
g_heat_1
# Create vector of clusters
cluster <- c(1: 4)
# Extract centers
center_df <- data.frame(cluster, centers_5)
# Reshape the data
center_reshape <- gather(center_df, features, values, PCS.1:PCS.22)
# View first few rows
head(center_reshape)
# Create plot
g_heat_1 <- ggplot(data = center_reshape, # Set dataset
aes(x = features, y = cluster, fill = values)) + # Set aesthetics
scale_y_continuous(breaks = seq(1, 5, by = 1)) + # Set y axis breaks
geom_tile() + # Geom tile for heatmap
coord_equal() + # Make scale the same for both axis
theme_set(theme_bw(base_size = 22) ) + # Set theme
scale_fill_gradient2(low = "blue", # Choose low color
mid = "white", # Choose mid color
high = "red", # Choose high color
midpoint =0, # Choose mid point
space = "Lab",
na.value ="grey", # Choose NA value
guide = "colourbar", # Set color bar
aesthetics = "fill") + # Select aesthetics to apply
coord_flip() # Rotate plot to view names clearly
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Generate plot
g_heat_1
# Create vector of clusters
cluster <- c(1: 4)
# Extract centers
center_df <- data.frame(cluster, centers_5)
# Reshape the data
center_reshape <- gather(center_df, features, values, MFQ.1:MFQ.33)
# View first few rows
head(center_reshape)
# Create plot
g_heat_1 <- ggplot(data = center_reshape, # Set dataset
aes(x = features, y = cluster, fill = values)) + # Set aesthetics
scale_y_continuous(breaks = seq(1, 5, by = 1)) + # Set y axis breaks
geom_tile() + # Geom tile for heatmap
coord_equal() + # Make scale the same for both axis
theme_set(theme_bw(base_size = 22) ) + # Set theme
scale_fill_gradient2(low = "blue", # Choose low color
mid = "white", # Choose mid color
high = "red", # Choose high color
midpoint =0, # Choose mid point
space = "Lab",
na.value ="grey", # Choose NA value
guide = "colourbar", # Set color bar
aesthetics = "fill") + # Select aesthetics to apply
coord_flip() # Rotate plot to view names clearly
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Generate plot
g_heat_1
plot_clust_cardinality <- as.data.frame(clusters_5)
names(plot_clust_cardinality) <- c("k_4") # Set names
# Create bar plots
g_6 <- ggplot(plot_clust_cardinality, aes( x = factor(k_4))) + # Set x as cluster values
geom_bar(stat = "count", fill = "steelblue") + # Use geom_bar with stat = "count" to count observations
labs(x = "Cluster Number", y="Points in Cluster", # Set labels
title = "Cluster Cardinality (k = 4)") +
theme(panel.grid.major = element_blank(), # Turn of the background grid
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
g_6
k_5_mag <- cbind.data.frame(c(1:4), fit_5$withinss) # Extract within cluster sum of squares
There were 27 warnings (use warnings() to see them)
names(k_5_mag) <- c("cluster", "withinss") # Fix names for plot data
# Create bar plot
g_7 <- ggplot(k_5_mag, aes(x = cluster, y = withinss)) + # Set x as cluster, y as withinss
geom_bar(stat = "identity", fill = "steelblue") + # Use geom bar and stat = "identity" to plot values directly
labs(x = "Cluster Number", y="Total Point to Centroid Distance", # Set labels
title = "Cluster Magnitude (k = 4)") +
theme(panel.grid.major = element_blank(), # Turn of the background grid
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
g_7
k_5_dat <- cbind.data.frame(table(clusters_5), k_5_mag[,2]) # Join magnitude and cardinality
names(k_5_dat) <- c("cluster", "cardinality", "magnitude") # Fix plot data names # card = how many samples there are and magnitude = how much error there is in each cluster
# Create scatter plot
g_8 <- ggplot(k_5_dat, aes(x = cardinality, y = magnitude, color = cluster)) + # Set aesthetics
geom_point(alpha = 0.8, size = 4) + # Set geom point for scatter
geom_smooth(aes(x = cardinality, y = magnitude), method = "lm",
se = FALSE, inherit.aes = FALSE, alpha = 0.5) + # Set trend line
labs(x = "Cluster Cardinality", y="Total Point to Centroid Distance", # Set labels
title = "Cluster Magnitude vs Cardinality \n(k = 4)") +
theme(panel.grid.major = element_blank(), # Turn of the background grid
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
g_8